knitr::opts_chunk$set(message = FALSE)

library(tidyverse)
library(lubridate)
library(plotly)
library(patchwork)

Data input and cleaning

bakery_df =
  read_csv("./Data/Bakery_sales.csv") %>% 
  janitor::clean_names() %>% 
  mutate(
    unit_price = str_replace(unit_price, "€", ""),
    unit_price = str_replace(unit_price, ",", "."),
    unit_price = as.numeric(unit_price),
    product_name = article) %>% 
  filter(product_name != ".") %>% 
  select(-article)

bakery_df
## # A tibble: 234,000 × 7
##       x1 date       time   ticket_number quantity unit_price product_name       
##    <dbl> <date>     <time>         <dbl>    <dbl>      <dbl> <chr>              
##  1     0 2021-01-02 08:38         150040        1       0.9  BAGUETTE           
##  2     1 2021-01-02 08:38         150040        3       1.2  PAIN AU CHOCOLAT   
##  3     4 2021-01-02 09:14         150041        2       1.2  PAIN AU CHOCOLAT   
##  4     5 2021-01-02 09:14         150041        1       1.15 PAIN               
##  5     8 2021-01-02 09:25         150042        5       1.2  TRADITIONAL BAGUET…
##  6    11 2021-01-02 09:25         150043        2       0.9  BAGUETTE           
##  7    12 2021-01-02 09:25         150043        3       1.1  CROISSANT          
##  8    15 2021-01-02 09:27         150044        1       1.05 BANETTE            
##  9    18 2021-01-02 09:32         150045        3       1.2  TRADITIONAL BAGUET…
## 10    19 2021-01-02 09:32         150045        6       1.1  CROISSANT          
## # … with 233,990 more rows

A pie chart showing the percentage of each product’s sale count (top 10)

plot_pie =
  bakery_df %>% 
  group_by(product_name) %>% 
  summarize(n_obs = n()) %>% 
  arrange(desc(n_obs)) %>% 
  head(10) %>% 
  plot_ly(labels = ~product_name, values = ~n_obs, type = 'pie', colors = "viridis") %>% 
  layout(title = 'Top 10 favoriate bakeries',
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))

plot_pie

top 10 favorite bread monthly sale counts in 2021

top_line =
  bakery_df %>% 
  separate(date, into = c("year", "month", "day"), sep = "-") %>% 
  mutate(year = as.numeric(year),
         month = as.integer(month),
         day = as.integer(day),) %>%
  filter(year == 2021) %>% 
  select (year, month, product_name, quantity) %>% 
  group_by (year, month, product_name) %>% 
  mutate (total_counts = sum (quantity)) %>% 
  distinct(year, month, product_name,total_counts) 

top_line1 =
  top_line %>% 
  filter(product_name %in% c("TRADITIONAL BAGUETTE")) %>% 
  ggplot(aes(x =month, y = total_counts, color = product_name)) + 
    geom_line() +
    theme(legend.position = "bottom") + 
    labs(x = "month",
         y = "counts of bread",
         title = "monthly sale counts of the most popular bread in 2021") +
    scale_x_continuous(
    breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), 
    labels = c("Jan","Feb","Mar","Apr","May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) 

top_line2 =
  top_line %>% 
  filter(product_name %in% c("COUPE", "BAGUETTE","BANETTE", "CROISSANT", "PAIN", "AU CHOCOLAT", "SPECIAL BREAD", "CEREAL BAGUETTE", "FORMULE SANDWICH", "BOULE 400G")) %>% 
  ggplot(aes(x =month, y = total_counts, color = product_name)) + 
    geom_line() +
    theme(legend.position = "bottom") + 
    labs(x = "month",
         y = "counts of bread",
         title = "top 2-10 popular bread monthly sale counts in 2021") +
    scale_x_continuous(
    breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12), 
    labels = c("Jan","Feb","Mar","Apr","May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")) 
 
top_line1+top_line2

A line plot of trends showing total sale revenue by month (trend)

plot_line =
  bakery_df %>% 
  separate(date, into = c("year", "month", "day"), sep = "-") %>% 
  mutate(year = as.numeric(year),
         month = as.integer(month),
         day = as.integer(day),) %>% 
  group_by(year, month) %>% 
  mutate(rev = quantity * unit_price) %>% 
  summarize(month_rev = sum(rev)) %>% 
  
  plot_ly(x = ~month, y = ~month_rev, mode = 'lines+markers', alpha = 0.5)

plot_line 

A scatterplot of distributions of counts for a specific bread by months from 2021 to 2022

plot_scatter =
  bakery_df %>% 
  separate(date, into = c("year", "month", "day"), sep = "-") %>% 
  mutate(year = as.numeric(year),
         month = as.integer(month),
         day = as.integer(day),) %>% 
  group_by(year, month) %>% 
  filter(product_name == "TRADITIONAL BAGUETTE") %>% 
  summarize(count = n()) %>%
  plot_ly(x = ~month, y = ~count, type = "scatter", color = ~year, mode = "markers", alpha = 0.5)
  
plot_scatter

ranking of the prices of DIVERS PATISSERIE

plot_bar =
  bakery_df %>% 
  filter(product_name %in% c("DIVERS PATISSERIE")) %>% 
  group_by(product_name) %>% 
  distinct (unit_price) %>% 
  arrange(desc(unit_price)) %>% 
  mutate(product_name = fct_reorder(product_name, unit_price)) %>% 
  ggplot(aes(x = unit_price, color = product_name)) + 
    geom_histogram(position = "dodge", binwidth = 2) + 
    theme(axis.text.x = element_text(hjust = 1)) + 
    labs(x = "unit price",
         title = "ranking of the prices of DIVERS PATISSERIE") +
    theme(legend.position = "bottom") 

plot_bar